Code
# Load required packages
library(dplyr)
library(tidyr)
library(pheatmap)
library(ggplot2)
library(reshape2)
library(patchwork)
library(ggrepel)
library(limma)
library(tibble)
library(here)
library(DT)
library(htmltools)Collaboration with the Bio-Chemistry group
# Load required packages
library(dplyr)
library(tidyr)
library(pheatmap)
library(ggplot2)
library(reshape2)
library(patchwork)
library(ggrepel)
library(limma)
library(tibble)
library(here)
library(DT)
library(htmltools)Using file:
/home/ager/bioinf02/ager/collab/bioquimica/bmi_proteomics/inputs/2025PROT002-70.xlsx
Warning: Expecting numeric in D4 / R4C4: got a date
summary(meta_pat) id Num Edad IMC Sexo
NW1 : 1 Min. : 1.00 Min. :35.00 Min. : 19.00 F: 7
NW2 : 1 1st Qu.: 17.25 1st Qu.:61.50 1st Qu.: 25.41 M:17
NW3 : 1 Median : 70.50 Median :68.50 Median : 27.75
NW4 : 1 Mean : 63.67 Mean :64.62 Mean : 1891.62
NW5 : 1 3rd Qu.: 92.75 3rd Qu.:72.00 3rd Qu.: 32.50
NW6 : 1 Max. :138.00 Max. :75.00 Max. :44733.00
(Other):18
Otros group
DM: 9 NW:7
No:15 OB:8
OW:9
meta_pat$IMC[meta_pat$id == "NW3"] <- 21.6
summary(meta_pat) id Num Edad IMC Sexo Otros
NW1 : 1 Min. : 1.00 Min. :35.00 Min. :19.00 F: 7 DM: 9
NW2 : 1 1st Qu.: 17.25 1st Qu.:61.50 1st Qu.:25.00 M:17 No:15
NW3 : 1 Median : 70.50 Median :68.50 Median :27.35
NW4 : 1 Mean : 63.67 Mean :64.62 Mean :28.64
NW5 : 1 3rd Qu.: 92.75 3rd Qu.:72.00 3rd Qu.:31.55
NW6 : 1 Max. :138.00 Max. :75.00 Max. :39.00
(Other):18
group
NW:7
OB:8
OW:9
knitr::kable(
meta_pat %>%
group_by(group) %>%
select(-Num) %>%
summarise(across(where(is.numeric),
list(mean = mean, sd = sd, min = min, max = max),
.names = "{.col}_{.fn}")))| group | Edad_mean | Edad_sd | Edad_min | Edad_max | IMC_mean | IMC_sd | IMC_min | IMC_max |
|---|---|---|---|---|---|---|---|---|
| NW | 53.85714 | 15.214342 | 35 | 72 | 23.500 | 2.338803 | 19.00 | 25.0 |
| OB | 66.00000 | 6.502747 | 55 | 75 | 34.475 | 3.002261 | 30.80 | 39.0 |
| OW | 71.77778 | 3.492055 | 65 | 75 | 27.460 | 1.510265 | 25.54 | 29.8 |
Objective: Check the group imbalance respect the variables Sex and DM.
chi_square_group <- function(var) {
tab <- table(meta_pat$group, meta_pat[[var]])
expected <- chisq.test(tab)$expected
# If some expected < 5 → Fisher
if (any(expected < 5)) {
test <- fisher.test(tab)
method <- "Fisher Exact test"
} else {
test <- chisq.test(tab)
method <- "Chi-square test"
}
list(
variable = var,
method = method,
table = tab,
p_value = test$p.value
)
}
res_sexo <- chi_square_group("Sexo")
res_otros <- chi_square_group("Otros")
res_sexo$variable
[1] "Sexo"
$method
[1] "Fisher Exact test"
$table
F M
NW 1 6
OB 2 6
OW 4 5
$p_value
[1] 0.5378961
The distribution of sex across groups does not seems imbalanced.
res_otros$variable
[1] "Otros"
$method
[1] "Fisher Exact test"
$table
DM No
NW 0 7
OB 7 1
OW 2 7
$p_value
[1] 0.0005690231
The distribution of DM across groups is highly unbalanced (Fisher’s exact test p = 5.7×10⁻⁴), indicating a strong and statistically significant association between having diabetes and overweight.
Confounding Bias: Having an unbalanced group means that is not possible to know if the study results are really associated to the phenotype of interest (Obessity & Over Weight) or to the counfounding variable (DM), as there is no way to separate the effect of these two variables.
# Add groups to see if cluster together
annotation_col <- data.frame(Group = factor(meta_pat$group))
rownames(annotation_col) <- colnames(expr_ALL)
pheatmap(expr_ALL,
color = colorRampPalette(c("blue", "yellow"))(100),
cluster_rows = TRUE,
cluster_cols = TRUE,
show_rownames = FALSE,
annotation_col = annotation_col,
main = "Expression heatmap")# Compute variance for each protein
gene_var <- apply(expr_ALL, 1, var)
# Select top 100 most variable proteins
top100_genes <- names(sort(gene_var, decreasing = TRUE))[1:100]
expr_top100 <- expr_ALL[top100_genes, ]
pheatmap(expr_top100,
color = colorRampPalette(c("blue", "yellow"))(100),
cluster_rows = TRUE,
cluster_cols = TRUE,
show_rownames = FALSE,
annotation_col = annotation_col,
main = "Expression heatmap (Top 100 most variable proteins)")It can be seen how there is no a clear clusterization of patients. However if we check the k=3 clusters, we can see how it seems that all the control grouo falls in almost in the same group. We can check further if there is some key aspects about this differentiation.
It can be seen how the OW8 and the OB8 patients seem to have different protein distribution compared to the rest of the opatients. Could be interesting to see if there are some other variables affecting to this differentiation.
res <- pheatmap(expr_top100, silent = TRUE)
col_clusters <- cutree(res$tree_col, k = 3)
# Add col_cluster to dataset
meta_pat$k3 <- as.factor(col_clusters)
# 0) Prepare a DM variable robustly
if ("DM" %in% names(meta_pat)) {
meta_pat <- meta_pat %>% mutate(DM_var = as.factor(DM))
} else {
# try to detect 'DM' mention inside Otros (case-insensitive)
if ("Otros" %in% names(meta_pat)) {
meta_pat <- meta_pat %>%
mutate(DM_var = ifelse(grepl("\\bDM\\b|diabete|diabetes", Otros, ignore.case = TRUE),
"yes", "no"),
DM_var = factor(DM_var, levels = c("no", "yes")))
} else {
# fallback: create an NA column so code still runs
meta_pat <- meta_pat %>% mutate(DM_var = factor(NA))
warning("No DM or Otros column found: DM_var set to NA for all rows.")
}
}
# Quick contingency tables (printed)
# cat("Counts: group by k3\n")
# print(table(meta_pat$k3, meta_pat$group))
# cat("\nCounts: Sexo by k3\n")
# print(table(meta_pat$k3, meta_pat$Sexo))
# cat("\nCounts: DM_var by k3\n")
# print(table(meta_pat$k3, meta_pat$DM_var))
# 1) Bar plot: group per k3 (counts)
p1 <- p1 <- ggplot(meta_pat, aes(x = k3, fill = group)) +
geom_bar(position = position_dodge(width = 0.8)) +
scale_fill_manual(values = c(
"NW" = "steelblue",
"OW" = "orange",
"OB" = "firebrick"
)) +
labs(title = "Counts of NW / OW / OB per cluster (k3)",
x = "Cluster (k3)",
y = "Count") +
theme_minimal() +
theme(legend.position = "right")
# 2) Bar plot: Sexo per k3
p2 <- ggplot(meta_pat, aes(x = k3, fill = Sexo)) +
geom_bar(position = position_dodge(width = 0.8)) +
scale_fill_manual(values = c(
"F" = "#C77CFF", # lilac
"M" = "#2E8B57" # green (sea green)
)) +
labs(title = "Counts of Sex per cluster (k3)",
x = "Cluster (k3)",
y = "Count") +
theme_minimal() +
theme(legend.position = "right")
# 3) Bar plot: DM (detected) per k3
# If DM_var is all NA, show an informative plot (empty)
if (all(is.na(meta_pat$DM_var))) {
p3 <- ggplot() +
annotate("text", x = 1, y = 1, label = "No DM information available", size = 5) +
labs(title = "DM counts per k3") +
theme_void()
} else {
p3 <- ggplot(meta_pat, aes(x = k3, fill = DM_var)) +
geom_bar(position = position_dodge(width = 0.8)) +
scale_fill_manual(values = c(
"no" = "#87CEEB", # soft green
"yes" = "#808080" # muted purple
)) +
labs(title = "Counts of DM per cluster (k3)",
x = "Cluster (k3)",
y = "Count",
fill = "DM") +
theme(legend.position = "right")
}
# 4) Boxplot: Edad by k3
p4 <- ggplot(meta_pat, aes(x = k3, y = Edad)) +
geom_boxplot() +
geom_jitter(width = 0.2, size = 1, alpha = 0.7) +
labs(title = "Age (Edad) distribution by cluster (k3)",
x = "Cluster (k3)",
y = "Edad (years)") +
theme_minimal()
# Combine into 2x2 layout
combined_plot <- (p1 | p2) / (p3 | p4) +
plot_annotation(title = "Summary of clusters (k3) by group / sex / DM / age")
# Print the combined plot
print(combined_plot)# Optionally save
# ggsave("k3_summary_2x2.png", combined_plot, width = 12, height = 9, dpi = 300)We compute the average expression difference between the three clustered groups and we ploted heat maps with the Top 20 proteins with higher average differences between groups. This way is possible to see which proteins were influenciating the separation of these clusters.
# Required packages
library(dplyr)
library(pheatmap)
# Make sure sample names line up (coerce id to character)
meta_pat <- meta_pat %>% mutate(id = as.character(id))
expr_top100 <- expr_top100[, meta_pat$id] # reorder columns to match meta_pat
# Ensure k3 is a factor with sensible ordering
k3 <- factor(meta_pat$k3, levels = sort(unique(meta_pat$k3)))
# 1) Compute mean expression per protein per k3
cluster_means <- sapply(levels(k3), function(cl) {
rowMeans(expr_top100[, meta_pat$k3 == cl, drop = FALSE])
})
colnames(cluster_means) <- paste0("k3_", levels(k3))
# 2) Compute the difference (max - min) across clusters for each protein
range_diff <- apply(cluster_means, 1, function(x) max(x) - min(x))
# 3) Select top 20 proteins by that range
top20 <- names(sort(range_diff, decreasing = TRUE))[1:20]
# 4) Build a summary table for the top20
top20_table <- data.frame(
protein = top20,
range = range_diff[top20],
max_cluster = apply(cluster_means[top20, , drop = FALSE], 1, function(x) {
levels(k3)[which.max(x)]
}),
stringsAsFactors = FALSE
)
# append cluster means as columns
top20_table <- cbind(top20_table, as.data.frame(cluster_means[top20, , drop = FALSE]))
top20_table <- top20_table %>% arrange(desc(range))
# Print the summary table
knitr::kable(top20_table)| protein | range | max_cluster | k3_1 | k3_2 | k3_3 | |
|---|---|---|---|---|---|---|
| Q8WZ42 | Q8WZ42 | 7.298724 | 3 | 4.5093720 | 6.3863886 | 11.808096 |
| P69892 | P69892 | 7.291471 | 3 | 0.9324249 | 1.2647962 | 8.223896 |
| P02730 | P02730 | 7.181224 | 3 | -0.3973903 | 0.9777581 | 6.783834 |
| P30043 | P30043 | 6.678449 | 3 | 2.6667735 | 2.1566507 | 8.835099 |
| P00918 | P00918 | 6.339924 | 3 | 3.8411256 | 3.2191613 | 9.559085 |
| P13861 | P13861 | 6.328053 | 3 | 4.8866951 | 4.9019790 | 11.214748 |
| P16157 | P16157 | 6.323038 | 3 | -1.2977679 | -1.3639409 | 4.959097 |
| P00915 | P00915 | 6.195384 | 3 | 5.9679439 | 5.6336189 | 11.829002 |
| P02042 | P02042 | 6.141251 | 3 | 7.8860207 | 7.8621365 | 14.003388 |
| P02549 | P02549 | 6.121085 | 3 | -0.0571943 | -0.2944001 | 5.826685 |
| P69905 | P69905 | 6.020232 | 3 | 9.9504515 | 10.0643913 | 15.970684 |
| P37840 | P37840 | 5.918742 | 3 | -0.5766669 | -0.3604814 | 5.342076 |
| P11277 | P11277 | 5.607273 | 3 | -1.4893687 | -1.5724950 | 4.034778 |
| P07738 | P07738 | 5.603473 | 3 | 0.4828676 | 0.3078136 | 5.911287 |
| P32119 | P32119 | 5.578854 | 3 | 6.6386089 | 6.5951686 | 12.174023 |
| Q9NTK5 | Q9NTK5 | 5.465644 | 3 | -1.5102646 | -2.4767640 | 2.988880 |
| P15169;CON__Q2KJ83 | P15169;CON__Q2KJ83 | 5.386589 | 3 | 0.6718884 | -1.8583186 | 3.528271 |
| P02751 | P02751 | 5.386556 | 3 | 3.8910747 | 1.2564756 | 6.643031 |
| P68871 | P68871 | 5.373292 | 3 | 10.8858563 | 10.8529311 | 16.226223 |
| P10145 | P10145 | 5.280836 | 3 | 0.7318161 | 0.8538745 | 6.012652 |
# 5) Order columns for heatmap: by k3, then by sample mean across top20
sample_means_top20 <- colMeans(expr_top100[top20, , drop = FALSE])
samp_df <- data.frame(id = colnames(expr_top100),
k3 = meta_pat$k3,
sample_mean = sample_means_top20,
stringsAsFactors = FALSE)
samp_df <- samp_df %>% arrange(k3, sample_mean)
sample_order <- samp_df$id
# 6) Prepare annotation for columns
annotation_col <- data.frame(k3 = factor(meta_pat$k3, levels = levels(k3)),
group = meta_pat$group)
rownames(annotation_col) <- meta_pat$id
# Optional: annotation colors (pick any palette you like)
annotation_colors <- list(
k3 = setNames(c("#F8766D", "#00BA38", "#619CFF")[seq_along(levels(k3))], levels(k3)),
group = c("NW" = "steelblue", "OW" = "orange", "OB" = "firebrick")
)
# 7) Plot pheatmap: scale rows to emphasize differences across proteins
pheatmap(expr_top100[top20, sample_order],
scale = "row",
cluster_rows = TRUE,
cluster_cols = FALSE, # columns are already ordered by k3 & sample mean
show_rownames = TRUE,
show_colnames = FALSE,
annotation_col = annotation_col,
annotation_colors = annotation_colors,
color = colorRampPalette(c("blue", "white", "red"))(100),
main = "Top 20 proteins (max-min across clusters)")In this plot we can see how the higher differences are influenced mainly by group 3, as these two patients have significantly different protein patterns compared to the other groups. Because of this reason we consider to check the differences between group1 vs group 2 and repeat the process.
# 2) Compute the difference (max - min) across clusters for each protein
range_diff <- abs(cluster_means[, "k3_2"] - cluster_means[, "k3_1"])
# 3) Select top 20 proteins by that range
top20 <- names(sort(range_diff, decreasing = TRUE))[1:20]
# 4) Build a summary table for the top20
top20_table <- data.frame(
protein = top20,
range = range_diff[top20],
max_cluster = apply(cluster_means[top20, , drop = FALSE], 1, function(x) {
levels(k3)[which.max(x)]
}),
stringsAsFactors = FALSE
)
# append cluster means as columns
top20_table <- cbind(top20_table, as.data.frame(cluster_means[top20, , drop = FALSE]))
top20_table <- top20_table %>% arrange(desc(range))
# Print the summary table
knitr::kable(top20_table)| protein | range | max_cluster | k3_1 | k3_2 | k3_3 | |
|---|---|---|---|---|---|---|
| P02788 | P02788 | 4.002797 | 3 | 2.3405930 | -1.6622044 | 2.7402993 |
| P08246 | P08246 | 3.583931 | 3 | 2.9892275 | -0.5947034 | 3.3125898 |
| P80188 | P80188 | 3.072084 | 3 | 2.0410812 | -1.0310029 | 3.1651088 |
| P59665;P59666 | P59665;P59666 | 3.035390 | 1 | 3.4400398 | 0.4046500 | 3.1733667 |
| P24158 | P24158 | 3.008182 | 3 | 1.4624522 | -1.5457294 | 3.4134294 |
| P14780 | P14780 | 2.843444 | 3 | 0.4037902 | -2.4396535 | 0.5919944 |
| P02751 | P02751 | 2.634599 | 3 | 3.8910747 | 1.2564756 | 6.6430314 |
| P15169;CON__Q2KJ83 | P15169;CON__Q2KJ83 | 2.530207 | 3 | 0.6718884 | -1.8583186 | 3.5282708 |
| P05164 | P05164 | 2.500001 | 3 | 0.7465525 | -1.7534488 | 2.0963375 |
| O95445 | O95445 | 2.466762 | 3 | 1.2830546 | -1.1837070 | 2.1669036 |
| P04114 | P04114 | 2.436852 | 3 | 5.3888056 | 2.9519536 | 7.9308293 |
| P05109 | P05109 | 2.433686 | 3 | 2.1356457 | -0.2980399 | 3.6859514 |
| P00739 | P00739 | 2.375068 | 3 | 1.7233332 | -0.6517352 | 2.6526386 |
| P02654 | P02654 | 2.262448 | 3 | 1.7374281 | -0.5250195 | 4.4096442 |
| P07711 | P07711 | 2.230401 | 3 | 2.5638309 | 0.3334304 | 2.6249792 |
| P27169 | P27169 | 2.183685 | 3 | 3.2379735 | 1.0542886 | 6.0283778 |
| P27797 | P27797 | 2.147894 | 2 | -0.0220323 | 2.1258615 | 2.1045362 |
| P30740 | P30740 | 2.092266 | 3 | 1.3038032 | -0.7884624 | 3.6459695 |
| P02656 | P02656 | 2.061566 | 1 | 2.5913551 | 0.5297889 | 2.5404083 |
| P00738 | P00738 | 2.042086 | 3 | 8.2949022 | 6.2528164 | 11.2897644 |
# 5) Order columns for heatmap: by k3, then by sample mean across top20
sample_means_top20 <- colMeans(expr_top100[top20, , drop = FALSE])
samp_df <- data.frame(id = colnames(expr_top100),
k3 = meta_pat$k3,
sample_mean = sample_means_top20,
stringsAsFactors = FALSE)
samp_df <- samp_df %>% arrange(k3, sample_mean)
sample_order <- samp_df$id
# 6) Prepare annotation for columns
annotation_col <- data.frame(k3 = factor(meta_pat$k3, levels = levels(k3)),
group = meta_pat$group)
rownames(annotation_col) <- meta_pat$id
# Optional: annotation colors (pick any palette you like)
annotation_colors <- list(
k3 = setNames(c("#F8766D", "#00BA38", "#619CFF")[seq_along(levels(k3))], levels(k3)),
group = c("NW" = "steelblue", "OW" = "orange", "OB" = "firebrick")
)
# 7) Plot pheatmap: scale rows to emphasize differences across proteins
pheatmap(expr_top100[top20, sample_order],
scale = "row",
cluster_rows = TRUE,
cluster_cols = FALSE, # columns are already ordered by k3 & sample mean
show_rownames = TRUE,
show_colnames = FALSE,
annotation_col = annotation_col,
annotation_colors = annotation_colors,
color = colorRampPalette(c("blue", "white", "red"))(100),
main = "Top 20 proteins (max-min across group 1 vs group 2)")We plot the protein distribution across patients in order to see if the expression pattern is similar or not.
# Add probe IDs as a column
exprs_ALL_df <- data.frame(Protein = rownames(expr_ALL), expr_ALL)
# Melt the data frame into long format
expr_long <- reshape2::melt(exprs_ALL_df, id.vars = "Protein", variable.name = "patient", value.name = "Expression")
# Add group variable
expr_long <- expr_long %>%
dplyr::mutate(group = as.factor(substr(patient, 1,2)))
# Density plot
a <- ggplot(expr_long, aes(x = Expression, color = patient)) +
geom_density() +
theme_bw() +
ggtitle("Expression density distributions per patient")
b <- ggplot(expr_long, aes(x = Expression, fill = 1)) +
geom_density(alpha = 0.5) +
theme_bw() +
theme(legend.position = "none") +
ggtitle("General expression density distributions")
# Density plot
c <- ggplot(expr_long, aes(x = Expression, color = group, fill = group)) +
geom_density(alpha = 0.1) +
theme_bw() +
ggtitle("Expression density distributions per group")
# Density plot
d <- ggplot(expr_long, aes(x = Expression, color = group, fill = group)) +
geom_boxplot(alpha = 0.1) +
theme_bw() +
ggtitle("Expression distributions' boxplots per group")
final_plot <- (a|b) / (c|d)
ggsave(
filename = "results/expression_density_combined.png",
plot = final_plot,
width = 12, # adjust as needed
height = 10, # adjust as needed
dpi = 300
)We also check the distribution of protein expression and the standard deviation of them acorss patients.
# Compute mean and SD across patients for each protein
protein_stats <- exprs_ALL_df %>%
tidyr::pivot_longer(-Protein, names_to = "patient", values_to = "Expression") %>%
dplyr::group_by(Protein) %>%
dplyr::summarise(
mean_expr = mean(Expression, na.rm = TRUE),
sd_expr = sd(Expression, na.rm = TRUE)
)
# Density plot of mean expression values
a <- ggplot(protein_stats, aes(x = mean_expr)) +
geom_density(fill = "steelblue", alpha = 0.4) +
theme_bw() +
ggtitle("Density of Mean Expression Per Protein") +
xlab("Mean Expression")
# Scatter plot: SD vs mean per protein
b <- ggplot(protein_stats, aes(x = mean_expr, y = sd_expr)) +
geom_point(alpha = 0.6) +
theme_bw() +
ggtitle("SD vs Mean Expression per Protein") +
xlab("Mean Expression") +
ylab("Standard Deviation")
a | bAs we have seen that there are two individuals that can clearly be differentiated with the rest in the heat map, we decide to remove them: OB8 and OW8
It is imporatnt to know where are the outliers coming from. We decide to eliminate because we saw that the difference of these samples with the rest of the population is significant and can be caused by an artifact, but it is neccesary to determine the origin of this bias in order to trust the results of the analysis.**
# Eliminate it from the exor_ALL matrix
expr_ALL <- expr_ALL[,!colnames(expr_ALL) %in% c("OW8", "OB8")]
meta_pat <- meta_pat[!meta_pat$id %in% c("OW8", "OB8"),]This section performs a principal component analysis to explore global patterns in the protein expression data. First, the sample metadata is reordered so that it matches the order of samples in the expression matrix. This is essential to ensure that each sample is correctly annotated. The expression matrix is then transposed so that samples are treated as observations and proteins as variables, which is required for principal component analysis. The analysis reduces thousands of protein measurements into a small number of components that summarize the main sources of variation in the data.
If the variable is numeric, such as age, samples are colored along a continuous scale to show gradients across the data. If the variable is categorical, such as group or sex, samples are colored by category and confidence ellipses are added to visualize clustering tendencies. This approach helps evaluate whether samples with shared characteristics tend to group together in the global expression space. The resulting plots are arranged into a single figure for side by side comparison.
# Ensure metadata is ordered to match expression matrix
meta_pat <- meta_pat[match(colnames(expr_ALL), meta_pat$id), ]
# Transpose so samples are rows for PCA
pca_res <- prcomp(t(expr_ALL), scale. = TRUE)
# build PCA dataframe
pca_df <- data.frame(
PC1 = pca_res$x[, 1],
PC2 = pca_res$x[, 2],
PC3 = pca_res$x[, 3],
PC4 = pca_res$x[, 4],
id = meta_pat$id,
group = meta_pat$group,
Sexo = meta_pat$Sexo,
Otros = meta_pat$Otros,
Edad = meta_pat$Edad
)
# Create PC pairs
pc_pairs <- list(
c("PC1","PC2"),
c("PC1","PC3"),
c("PC1","PC4"),
c("PC2","PC3"),
c("PC2","PC4"),
c("PC3","PC4")
)
# Meta variables
color_vars <- c("Sexo", "Otros", "Edad", "group")
# Function to create pca_plot
make_pca_plot <- function(df, pcs, color_var) {
ggplot(df, aes_string(x = pcs[1], y = pcs[2], color = color_var)) +
geom_point(size = 3) +
geom_text_repel(aes(label = id), size = 3) +
theme_bw() +
ggtitle(paste0(pcs[1], " vs ", pcs[2], " colored by ", color_var)) +
xlab(paste0(pcs[1], " (", round(summary(pca_res)$importance[2, as.numeric(substr(pcs[1],3,3))] * 100, 1), "%)")) +
ylab(paste0(pcs[2], " (", round(summary(pca_res)$importance[2, as.numeric(substr(pcs[2],3,3))] * 100, 1), "%)"))
}
# Generate plot list
plots <- list()
# Create each pca plot
for (pc in pc_pairs) {
for (colvar in color_vars) {
p <- make_pca_plot(pca_df, pc, colvar)
plots <- append(plots, list(p))
}
}
# Combinar todos los gráficos en una cuadrícula
final_plot <- wrap_plots(plots, ncol = 2)
# Save PCA plots
ggsave("results/PCA_all_combinations.png",
final_plot,
width = 16, height = 40, dpi = 300)
# print(final_plot) We can see how the PC4 (for example when plotting it against PC2) seems to capturate a bit the effect of the sex in the population. These indications determine the need to correct the analyses by gender.
# Add groups to see if cluster together
library(factoextra)
library(ggpubr)
library(ggplot2)
# PCA with factoextra
res.pca <- prcomp(t(expr_ALL), scale.=TRUE, center=TRUE)
# Variables wanted to plot:
vars <- c("group", "Sexo", "Otros", "Edad")
plots <- list()
for (v in vars) {
vec <- meta_pat[[v]]
# If numeric variable:
if (is.numeric(vec)) {
p <- fviz_pca_ind(res.pca,
addEllipses = FALSE,
repel = TRUE,
label = "none"
) +
geom_point(aes(color = vec), size = 3) +
scale_color_viridis_c(option = "plasma") +
labs(color = v, title = paste("PCA grouped by", v))
} else {
# Categorical variable:
p <- fviz_pca_ind(res.pca,
addEllipses = TRUE,
ellipse.level = 0.95,
repel = TRUE,
palette = "Dark2",
habillage = vec,
label = "none"
) +
labs(title = paste("PCA grouped by", v))
}
# Save plot
plots[[v]] <- p
}
# Combine 4 plots:
ggarrange(
plots$group, plots$Sexo, plots$Otros, plots$Edad,
# labels = vars,
ncol = 2, nrow = 2
)Now, we are going to prepare the data for statistical modeling of individual protein expression levels. The metadata is again aligned with the expression matrix to guarantee correct sample matching. Body mass index and age are transformed in two ways. First, they are centered (in 0) to make model interpretation more intuitive. Second, they are standardized so that all numeric predictors are on comparable scales, which improves model stability. The objective is to analyze how protein levels changes if the bmi changes.
A preliminary linear model is fitted to evaluate collinearity among predictors. Variance inflation factors and diagnostic plots are used to check whether predictors provide redundant information. This step ensures that subsequent models are statistically sound and interpretable.
library(car)
# reorder metadata to match expr_ALL columns
meta_pat <- meta_pat[match(colnames(expr_ALL), meta_pat$id), ]
# basic check
stopifnot(all(colnames(expr_ALL) == meta_pat$id))
# Optionally center continuous covariates (helps interpret intercepts)
meta_pat$IMC_c <- scale(meta_pat$IMC, center = TRUE, scale = FALSE)[,1]
meta_pat$Edad_c <- scale(meta_pat$Edad, center = TRUE, scale = FALSE)[,1]
# Scale numeric covariates to be on similar scales:
# Center and scale (z-score)
meta_pat$IMC_s <- scale(meta_pat$IMC, center = TRUE, scale = TRUE)[,1]
meta_pat$Edad_s <- scale(meta_pat$Edad, center = TRUE, scale = TRUE)[,1]
# Consider looking for colinarity between predictors:
model_vif <- lm(IMC ~ Edad + Sexo + group + Otros, data=meta_pat)
vif(model_vif) GVIF Df GVIF^(1/(2*Df))
Edad 1.807965 1 1.344606
Sexo 1.189817 1 1.090787
group 3.717756 2 1.388578
Otros 2.218370 1 1.489419
plot(model_vif, which = 1, main = "Model Fit")In this case, we drop group (it does not reach vif value = 5, but it’s ‘measuring’ the same as the bmi).
After that, a multivariable linear model is fitted to test the association between protein expression and body mass index while adjusting for age, sex, and other covariates. A design matrix is constructed that encodes these predictors in a form suitable for high dimensional modeling. The limma framework is then applied to fit the model to all proteins simultaneously.
Empirical Bayes moderation is used to improve variance estimation, which is particularly important when sample sizes are limited. Results for the body mass index effect are extracted for all proteins and corrected for multiple testing. A volcano plot is generated to visualize both the magnitude and statistical significance of associations. Proteins with stronger and more significant associations are highlighted and labeled. The plot is saved for inclusion in the report.
# IMC:
### Build design matrix
# Model: Expression ~ IMC + Edad + Sexo
# (Intercept corresponds to reference level of Sexo)
design_bmi <- model.matrix(~ IMC_s + Edad_s + Sexo + Otros, data = meta_pat) # Add DB and scaled data:
colnames(design_bmi)[1] "(Intercept)" "IMC_s" "Edad_s" "SexoM" "OtrosNo"
# Check design
print(head(design_bmi)) (Intercept) IMC_s Edad_s SexoM OtrosNo
1 1 -0.8862474 0.31435176 1 1
2 1 -0.6753274 0.06133693 1 1
3 1 -1.3924553 -1.28807550 1 1
4 1 -1.9408472 -2.46881138 1 1
5 1 -0.6964194 -2.46881138 1 1
6 1 -0.6753274 0.65170487 0 1
## Fit limma
# expr_ALL must be numeric matrix
expr_mat <- as.matrix(expr_ALL)
fit_bmi <- lmFit(expr_mat, design_bmi)
fit_bmi <- eBayes(fit_bmi, trend = TRUE, robust = TRUE) # Use empirical Bayes with robust options and trend
### Extract results for BMI (IMC_c)
# Determine the exact coefficient name for IMC_c in design
coef_name <- "IMC_s"
if(!coef_name %in% colnames(design_bmi)) stop("Coefficient name for BMI not found in design.")
res_bmi <- topTable(fit_bmi, coef = coef_name, number = Inf, sort.by = "P") # res contains logFC (effect per unit of IMC_c), AveExpr, t, P.Value, adj.P.Val, B# Add proteinID as a column
res_bmi$proteinID <- rownames(res_bmi)
# Optional: move proteinID to first column
res_bmi <- res_bmi[, c("proteinID", setdiff(colnames(res_bmi), "proteinID"))]
dt_bmi <- datatable(
res_bmi,
options = list(
pageLength = 5,
initComplete = JS(
"function(settings, json) {",
"$('div.dataTables_wrapper').find('table').css('font-size', '14px');",
"$('div.dataTables_wrapper').find('caption').css({'font-size': '16px', 'font-weight': 'bold'});",
"}"
),
columnDefs = list(
list(
targets = "_all",
render = JS(
"function(data, type, row, meta) {",
" if (typeof data === 'number') {",
" if (meta.col == 4 || meta.col == 5) {",
" return parseFloat(data).toExponential(2);",
" } else {",
" return parseFloat(data).toFixed(3);",
" }",
" } else {",
" return data;",
" }",
"}"
)
)
)
),
rownames = FALSE,
caption = "BMI multivariate analysis results"
)
browsable(tagList(dt_bmi))### 5. Add protein names column (rownames are proteins)
res_bmi <- res_bmi %>%
tibble::rownames_to_column(var = "Protein") %>%
as.data.frame()
### 6. Volcano plot (ggplot2) - custom
# choose thresholds (you can change)
logFC_thr <- 0.2 # example: proteins with abs(logFC) > 0.2 considered biologically relevant (adjust as you wish)
padj_thr <- 0.05
res_bmi$significant <- with(res_bmi, (adj.P.Val < padj_thr) & (abs(logFC) > logFC_thr))
volc_df_bmi <- res_bmi %>%
mutate(minusLog10P = -log10(P.Value + 1e-300)) # avoid -Inf
p_volcano_bmi <- ggplot(volc_df_bmi, aes(x = logFC, y = minusLog10P)) +
geom_point(aes(color = significant), alpha = 0.7) +
scale_color_manual(values = c("grey50", "red")) +
geom_vline(xintercept = c(-logFC_thr, logFC_thr), linetype = "dashed") +
geom_hline(yintercept = -log10(padj_thr), linetype = "dashed") +
theme_bw() +
xlab("log2 fold change (per 1 BMI unit)") +
ylab("-log10(p-value)") +
ggtitle("Volcano plot: association of protein expression with BMI (IMC)") +
theme(legend.position = "none")
# label top hits (by smallest adj.P.Val or largest effect)
top_to_label_bmi <- volc_df_bmi %>% arrange(adj.P.Val) %>% head(10)
p_volcano_bmi <- p_volcano_bmi +
geom_text_repel(data = top_to_label_bmi,
aes(label = Protein),
size = 3,
max.overlaps = 20)
# Show plot (optional)
print(p_volcano_bmi)# Save to file (PNG)
ggsave("results/volcano_IMC_limma.png", p_volcano_bmi, width = 8, height = 6, dpi = 300)The results seem to be really close to be sttistically significant (<0.05). Take into account that as we are checking the BMI (there is no any group information), we are able to correct by DM. This means that the differentially expressed proteins are independent to DM.
When comparing between groups it is not possible to remove DM effect completely, as the NW group has 0 cases of DM.
Finally, we wanted to evaluate differences in protein expression between predefined groups. First, a linear model is fitted that includes group as the main factor while adjusting for age, sex, and other covariates. An overall statistical test identifies proteins that vary across groups. Then, specific pairwise group comparisons are defined using contrasts.
For each contrast, the model is refitted and statistical results are extracted. A reusable function generates volcano plots for each comparison, highlighting proteins with both large expression differences and strong statistical support. The most significant proteins are labeled to facilitate biological interpretation. As output, you can also find several interactive tables with teh results of the different comparisons:
# Anova (F-test):
# Design with intercept (recommended)
design_group <- model.matrix(~ group + Edad_c + Sexo + Otros, data = meta_pat)
colnames(design_group) <- make.names(colnames(design_group))
# Verify
stopifnot(qr(design_group)$rank == ncol(design_group))
# Fit model
fit_group <- lmFit(expr_mat, design_group)
fit_group <- eBayes(fit_group, trend = TRUE, robust = TRUE)
res_overall <- topTable(fit_group, coef = grep("^group", colnames(design_group)),
number = Inf, sort.by = "F")
# Pairwise comparison:
# Define contrasts for all pairwise comparisons
contr.matrix <- makeContrasts(
OB_vs_NW = groupOB,
OW_vs_NW = groupOW,
OB_vs_OW = groupOB - groupOW,
levels = design_group
)
# Fit model
fit2 <- contrasts.fit(fit_group, contr.matrix)
fit2 <- eBayes(fit2, trend = TRUE, robust = TRUE)
tt_over_norm <- topTable(fit2, coef="OW_vs_NW", number=Inf, adjust.method = "BH")
tt_obese_norm <- topTable(fit2, coef="OB_vs_NW", number=Inf, adjust.method = "BH")
tt_obese_over <- topTable(fit2, coef="OB_vs_OW", number=Inf,adjust.method = "BH")
# --- 5. Function to make volcano plots for a given contrast
make_volcano <- function(fit2, contrast_name, logFC_thr = 0.2, padj_thr = 0.05, top_n = 10) {
res <- topTable(fit2, coef = contrast_name, number = Inf, sort.by = "P") %>%
rownames_to_column("Protein") %>%
as.data.frame()
res$significant <- with(res, (adj.P.Val < padj_thr) & (abs(logFC) > logFC_thr))
res$minusLog10P <- -log10(res$P.Value + 1e-300)
# Volcano plot
p <- ggplot(res, aes(x = logFC, y = minusLog10P)) +
geom_point(aes(color = significant), alpha = 0.7) +
scale_color_manual(values = c("grey50", "red")) +
geom_vline(xintercept = c(-logFC_thr, logFC_thr), linetype = "dashed") +
geom_hline(yintercept = -log10(padj_thr), linetype = "dashed") +
theme_bw() +
xlab("log2 fold change") +
ylab("-log10(p-value)") +
ggtitle(paste0("Volcano: ", contrast_name)) +
theme(legend.position = "none")
# Label top proteins
top_to_label <- res %>% arrange(adj.P.Val) %>% head(top_n)
p <- p + geom_text_repel(data = top_to_label,
aes(label = Protein),
size = 3,
max.overlaps = 20)
return(p)
}
# --- 6. Make volcano plots for each contrast
p_OB_vs_NW <- make_volcano(fit2, "OB_vs_NW")
p_OW_vs_NW <- make_volcano(fit2, "OW_vs_NW")
p_OB_vs_OW <- make_volcano(fit2, "OB_vs_OW")
# --- 7. Optionally save plots
ggsave("results/volcano_OB_vs_NW.png", p_OB_vs_NW, width = 8, height = 6, dpi = 300)
ggsave("results/volcano_OW_vs_NW.png", p_OW_vs_NW, width = 8, height = 6, dpi = 300)
ggsave("results/volcano_OB_vs_OW.png", p_OB_vs_OW, width = 8, height = 6, dpi = 300)Display the pairwise comparison in interactive tables:
library(DT)
library(htmltools)
tt_over_norm$proteinID <- rownames(tt_over_norm)
tt_obese_norm$proteinID <- rownames(tt_obese_norm)
tt_obese_over$proteinID <- rownames(tt_obese_over)
# Named list of topTables
tt_list <- list(
"OW_vs_NW" = tt_over_norm,
"OB_vs_NW" = tt_obese_norm,
"OB_vs_OW" = tt_obese_over
)
# Create a list of datatables
dt_list <- lapply(seq_along(tt_list), function(i) {
tt <- tt_list[[i]]
df_name <- names(tt_list)[i]
datatable(
head(tt, n = nrow(tt)),
options = list(
pageLength = 5,
initComplete = JS(
"function(settings, json) {",
"$('div.dataTables_wrapper').find('table').css('font-size', '14px');",
"$('div.dataTables_wrapper').find('caption').css({'font-size': '16px', 'font-weight': 'bold'});",
"}"
),
columnDefs = list(
list(
targets = "_all",
render = JS(
"function(data, type, row, meta) {",
"if (typeof data === 'number') {",
" if (meta.col == 5) {",
" return parseFloat(data).toExponential(2);",
" } else {",
" return parseFloat(data).toFixed(3);",
" }",
"} else {",
" return data;",
"}",
"}"
)
)
)
),
rownames = FALSE,
caption = df_name
)
})
# Render all tables at once
browsable(tagList(dt_list))In summary, we reached the following conclusions.